RNA-seq analysis

# Load libraries and custom functions -----------------------------------------
library(easypackages)
libraries("here","ggplot2","WGCNA","limma","statmod","edgeR","biomaRt",
          "sva","patchwork","Biobase","reshape2","genefilter","variancePartition",
          "BiocParallel","tidyverse","gplots","enrichR","ggrepel","glue","ggtext")
source(here("code","utils.R"))

# Various parameters to set ---------------------------------------------------
# set FDR thresh
fdr_qThresh = 0.05
minReads = 100
varianceFilterCutoff = 0.15
nsv = 2

# Paths -----------------------------------------------------------------------
# paths
root_path = here()
data_path = here("data")
code_path = here("code")
plot_path = here("plots")
result_path = here("results")
res_path = file.path(result_path,"filt","sva")

# load data -------------------------------------------------------------------
# read count data
reads_file = file.path(data_path,"tidy_reads.csv")
reads_data = read.csv(reads_file)

# sample meta data
meta_file = file.path(data_path,"tidy_metadata.csv")
meta_data = read.csv(meta_file)
colnames(reads_data)[4:ncol(reads_data)] = meta_data$sample_id

# mouse gene meta data - from utils::getGeneMetaData
gene_metadata_file = file.path(data_path,"tidy_gene_meta_data.csv")
gene_meta_data = read.csv(gene_metadata_file,row.names = 1)

# mouse to human homolog gene meta data - from utils::getHumanHomolog
hsap_homolog_metadata_file = file.path(data_path,"tidy_human_homolog_gene_meta_data.csv")
hsap_homolog_gene_meta_data = read.csv(hsap_homolog_metadata_file)

# set up expression and meta data matrices ------------------------------------

# PFC data
pfc_names = meta_data$sample_id[meta_data$brain_region=="PFC"]
exprs_pfc = reads_data[,pfc_names]
rownames(exprs_pfc) = reads_data$ensembl_id
meta_data_pfc = subset(meta_data, meta_data$brain_region=="PFC")

# HYT 
hyt_names = meta_data$sample_id[meta_data$brain_region=="HYT"]
exprs_hyt = reads_data[,hyt_names]
rownames(exprs_hyt) = reads_data$ensembl_id
meta_data_hyt = subset(meta_data, meta_data$brain_region=="HYT")

Preprocessing

### PCA to summarize the Picard sequencing variables --------------------------
npcs2use = 10
seq_vars2use = c("PCT_CODING_BASES","PCT_UTR_BASES","PCT_INTRONIC_BASES",
                 "PCT_INTERGENIC_BASES","MEDIAN_CV_COVERAGE","MEDIAN_5PRIME_BIAS",
                 "ALIGNED_READS","AT_DROPOUT")

# PFC data
pca_res_seq_pfc = prcomp(meta_data_pfc[,seq_vars2use], center = TRUE, scale = TRUE)
meta_data_pfc = cbind(meta_data_pfc, pca_res_seq_pfc$x)
summary(pca_res_seq_pfc)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5    PC6     PC7
## Standard deviation     1.8701 1.2636 1.2411 0.89062 0.71602 0.1980 0.13927
## Proportion of Variance 0.4372 0.1996 0.1925 0.09915 0.06409 0.0049 0.00242
## Cumulative Proportion  0.4372 0.6368 0.8293 0.92846 0.99254 0.9974 0.99987
##                            PC8
## Standard deviation     0.03253
## Proportion of Variance 0.00013
## Cumulative Proportion  1.00000
# HYT data
pca_res_seq_hyt = prcomp(meta_data_hyt[,seq_vars2use], center = TRUE, scale = TRUE)
meta_data_hyt = cbind(meta_data_hyt, pca_res_seq_hyt$x)
summary(pca_res_seq_hyt)
## Importance of components:
##                          PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     1.778 1.4909 1.0176 0.8314 0.80445 0.46944 0.1522
## Proportion of Variance 0.395 0.2778 0.1294 0.0864 0.08089 0.02755 0.0029
## Cumulative Proportion  0.395 0.6728 0.8023 0.8887 0.96955 0.99710 1.0000
##                             PC8
## Standard deviation     0.005957
## Proportion of Variance 0.000000
## Cumulative Proportion  1.000000
### Filtering ------------------------------------------------------------------

# PFC only 
genes2keep = rowSums(exprs_pfc>minReads)>=2

# genes to throw out
tossedOutData = exprs_pfc[!genes2keep,]

# grab only genes to keep
exprs_filt_pfc = exprs_pfc[genes2keep,]

sprintf("Genes before filtering out low reads = %d",dim(exprs_pfc)[1])
## [1] "Genes before filtering out low reads = 54165"
sprintf("Genes after filtering out low reads = %d",dim(exprs_filt_pfc)[1])
## [1] "Genes after filtering out low reads = 14506"
sprintf("Max reads for any gene tossed out = %d",max(as.matrix(tossedOutData)))
## [1] "Max reads for any gene tossed out = 212"
sprintf("Mean reads for any gene tossed out = %f",mean(as.matrix(tossedOutData)))
## [1] "Mean reads for any gene tossed out = 4.012234"
sprintf("Median reads for any gene tossed out = %f",median(as.matrix(tossedOutData)))
## [1] "Median reads for any gene tossed out = 0.000000"
genes_before = dim(exprs_filt_pfc)[1]
exprs_filt_pfc = varFilter(as.matrix(exprs_filt_pfc), var.cutoff = varianceFilterCutoff)
genes_after = dim(exprs_filt_pfc)[1]

print(sprintf("Genes before variance filtering at %.02f = %d",varianceFilterCutoff,genes_before))
## [1] "Genes before variance filtering at 0.15 = 14506"
print(sprintf("Genes after variance filtering at %.02f = %d",varianceFilterCutoff,genes_after))
## [1] "Genes after variance filtering at 0.15 = 12294"
# HYT only 
genes2keep = rowSums(exprs_hyt>minReads)>=2

# genes to throw out
tossedOutData = exprs_hyt[!genes2keep,]

# grab only genes to keep
exprs_filt_hyt = exprs_hyt[genes2keep,]

sprintf("Genes before filtering out low reads = %d",dim(exprs_hyt)[1])
## [1] "Genes before filtering out low reads = 54165"
sprintf("Genes after filtering out low reads = %d",dim(exprs_filt_hyt)[1])
## [1] "Genes after filtering out low reads = 16155"
sprintf("Max reads for any gene tossed out = %d",max(as.matrix(tossedOutData)))
## [1] "Max reads for any gene tossed out = 643"
sprintf("Mean reads for any gene tossed out = %f",mean(as.matrix(tossedOutData)))
## [1] "Mean reads for any gene tossed out = 4.337044"
sprintf("Median reads for any gene tossed out = %f",median(as.matrix(tossedOutData)))
## [1] "Median reads for any gene tossed out = 0.000000"
genes_before = dim(exprs_filt_hyt)[1]
exprs_filt_hyt = varFilter(as.matrix(exprs_filt_hyt), var.cutoff = varianceFilterCutoff)
genes_after = dim(exprs_filt_hyt)[1]

print(sprintf("Genes before variance filtering at %.02f = %d",varianceFilterCutoff,genes_before))
## [1] "Genes before variance filtering at 0.15 = 16155"
print(sprintf("Genes after variance filtering at %.02f = %d",varianceFilterCutoff,genes_after))
## [1] "Genes after variance filtering at 0.15 = 13727"
### Scale data by library size ------------------------------------------------

# turn into DGEList object
dge_pfc = DGEList(counts=exprs_filt_pfc)
dge_hyt = DGEList(counts=exprs_filt_hyt)

# Calculate normalization factors to scale the raw library sizes.
dgeN_pfc = calcNormFactors(dge_pfc, method="TMM")
dgeN_hyt = calcNormFactors(dge_hyt, method="TMM")


### Run voom to get log-CPM and precision weights for DE model ----------------

# PFC only --------------------------------------------------------------------
# make design matrix for modeling with voom
form2use = ~ group*sex + RIN
design_pfc = model.matrix(form2use, data = meta_data_pfc)

# use voom to calculate precision weights plus output count data as logCPM
v_pfc = voom(dgeN_pfc, design_pfc, plot=TRUE)

voomed_edata_pfc = v_pfc$E
voomed_precision_weights_pfc = v_pfc$weights

# HYT only --------------------------------------------------------------------
# make design matrix for modeling with voom
form2use = ~ group*sex + RIN
design_hyt = model.matrix(form2use, data = meta_data_hyt)

# use voom to calculate precision weights plus output count data as logCPM
v_hyt = voom(dgeN_hyt, design_hyt, plot=TRUE)

voomed_edata_hyt = v_hyt$E
voomed_precision_weights_hyt = v_hyt$weights


### Plot final preprocessed data ----------------------------------------------

# PFC plot data
x = melt(as.matrix(v_pfc$E))
colnames(x) = c('gene_name', 'sample_id', 'value')
ggplot(x, aes(x=value, color=sample_id)) + geom_density()

# HYT plot data
x = melt(as.matrix(v_hyt$E))
colnames(x) = c('gene_name', 'sample_id', 'value')
ggplot(x, aes(x=value, color=sample_id)) + geom_density()

### PCA on preprocessed expression data ---------------------------------------

# PFC -------------------------------------------------------------------------
pca_expr_pfc = prcomp(t(v_pfc$E), center = TRUE, scale = TRUE)  
summary(pca_expr_pfc)
## Importance of components:
##                            PC1     PC2     PC3      PC4      PC5      PC6
## Standard deviation     67.3607 45.0851 35.1122 29.41617 24.55718 23.05712
## Proportion of Variance  0.3691  0.1653  0.1003  0.07038  0.04905  0.04324
## Cumulative Proportion   0.3691  0.5344  0.6347  0.70508  0.75414  0.79738
##                             PC7      PC8      PC9     PC10    PC11    PC12
## Standard deviation     19.89234 18.68826 18.33707 17.32292 13.7168 13.0248
## Proportion of Variance  0.03219  0.02841  0.02735  0.02441  0.0153  0.0138
## Cumulative Proportion   0.82957  0.85798  0.88533  0.90973  0.9250  0.9388
##                            PC13     PC14     PC15     PC16    PC17    PC18
## Standard deviation     12.25960 11.50353 10.54770 10.03052 9.93079 9.23555
## Proportion of Variance  0.01223  0.01076  0.00905  0.00818 0.00802 0.00694
## Cumulative Proportion   0.95106  0.96183  0.97088  0.97906 0.98708 0.99402
##                           PC19      PC20
## Standard deviation     8.57407 1.974e-13
## Proportion of Variance 0.00598 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00
# HYT -------------------------------------------------------------------------
pca_expr_hyt = prcomp(t(v_hyt$E), center = TRUE, scale = TRUE)  
summary(pca_expr_hyt)
## Importance of components:
##                            PC1     PC2      PC3      PC4      PC5     PC6
## Standard deviation     82.2350 43.7813 36.12868 28.92985 23.75485 22.4755
## Proportion of Variance  0.4926  0.1396  0.09509  0.06097  0.04111  0.0368
## Cumulative Proportion   0.4926  0.6323  0.72737  0.78834  0.82945  0.8662
##                             PC7      PC8      PC9     PC10    PC11     PC12
## Standard deviation     17.94329 15.82675 14.89912 13.87575 12.7791 12.18479
## Proportion of Variance  0.02345  0.01825  0.01617  0.01403  0.0119  0.01082
## Cumulative Proportion   0.88971  0.90795  0.92413  0.93815  0.9500  0.96086
##                            PC13    PC14     PC15    PC16    PC17      PC18
## Standard deviation     11.11135 10.9901 10.28565 9.97009 9.36854 2.175e-13
## Proportion of Variance  0.00899  0.0088  0.00771 0.00724 0.00639 0.000e+00
## Cumulative Proportion   0.96986  0.9787  0.98636 0.99361 1.00000 1.000e+00
### Surrogate Variable Analysis (SVA) -----------------------------------------

# basic model before SVA
svaform2use = ~ group*sex + RIN


# PFC 
sva_res_pfc = runSVA(expr_data = v_pfc$E, 
                       meta_data = meta_data_pfc, 
                       form2use = svaform2use, 
                       nsv = nsv, 
                       svestMethod = "leek")
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
# new meta_data data frame with the SVs
meta_data_pfc = sva_res_pfc$meta_data
# new model to use later in DE model, which incorporates SVs
form4model_pfc = sva_res_pfc$form2use

# HYT
sva_res_hyt = runSVA(expr_data = v_hyt$E, 
                       meta_data = meta_data_hyt, 
                       form2use = svaform2use, 
                       nsv = nsv, 
                       svestMethod = "leek")
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
# new meta_data data frame with the SVs
meta_data_hyt = sva_res_hyt$meta_data
# new model to use later in DE model, which incorporates SVs
form4model_hyt = sva_res_hyt$form2use



### Plot heatmaps -------------------------------------------------------------

# PFC 
# plot correlations of PCA of expression data against PCA of sequencing-related variables 
r_mat = cor(pca_expr_pfc$x, pca_res_seq_pfc$x)
heatmap.2(r_mat,
          col = blueWhiteRed(50),
          Rowv = TRUE, 
          trace = "none",
          cellnote= round(as.matrix(r_mat),digits = 2), 
          notecol = "black",
          notecex=0.9,
          density.info = "none", 
          key.xlab ="Correlation (r)")

print(r_mat)
##               PC1          PC2          PC3         PC4         PC5
## PC1  -0.932530344  0.206630234  0.016102017 -0.11210853  0.07612601
## PC2  -0.197570167 -0.394090715 -0.096729275 -0.52226418  0.06251997
## PC3  -0.108667842 -0.728316401  0.410460491  0.03780872 -0.18791135
## PC4  -0.031634980 -0.278681139 -0.837069125 -0.06793425 -0.17243203
## PC5   0.083252803  0.019612289  0.196228354 -0.35416420 -0.51334731
## PC6  -0.012317975  0.239402929 -0.060505908 -0.07705545 -0.60396481
## PC7   0.156028198  0.260859904  0.078149293 -0.57357412  0.07723189
## PC8  -0.013290741  0.066576191 -0.046141700 -0.09895332 -0.16874454
## PC9  -0.130492844 -0.006894644  0.148871305  0.17870935  0.02934789
## PC10  0.007680893 -0.081190818  0.093785108 -0.14244978 -0.02599418
## PC11  0.017728460 -0.070512653 -0.102768376  0.06906079  0.10878710
## PC12 -0.144109103  0.108047117 -0.003641237  0.28851615 -0.37338938
## PC13 -0.009526943 -0.145772386  0.004459942  0.02580745 -0.12655331
## PC14  0.030428558 -0.071917351 -0.041471137  0.05691040  0.12089484
## PC15  0.024580982  0.104299973  0.104495285 -0.09236672 -0.03431177
## PC16 -0.068858030 -0.029158501 -0.012454110  0.26206314 -0.22579187
## PC17  0.045150045  0.005761638 -0.009688141 -0.09688062  0.14573053
## PC18  0.013872361  0.024641643  0.095583356  0.00615551 -0.02527730
## PC19 -0.013308579 -0.040414263 -0.085098025  0.08775646 -0.07110561
## PC20 -0.575070009  0.440271494  0.222988608  0.21071857 -0.10951222
##               PC6          PC7          PC8
## PC1   0.091982868 -0.089207144  0.188140339
## PC2  -0.111683955  0.394129879 -0.518584019
## PC3   0.171057341 -0.400181852  0.099736953
## PC4  -0.153960197 -0.217705885  0.214794728
## PC5  -0.398157620  0.006739672  0.213603847
## PC6   0.359580139 -0.034522400 -0.055037109
## PC7   0.165682132 -0.253410471 -0.152794869
## PC8   0.105776700 -0.110561434 -0.007013627
## PC9  -0.578568989 -0.017525376 -0.093255895
## PC10  0.166320380  0.091444607  0.157405954
## PC11  0.147382328 -0.180079910 -0.123893927
## PC12 -0.134577806 -0.052025825 -0.444696626
## PC13  0.201340415  0.572037855  0.241181685
## PC14  0.330525732 -0.044068921 -0.164972351
## PC15 -0.155537558 -0.145752001 -0.015556238
## PC16  0.138191885  0.083032915 -0.306982124
## PC17 -0.052988134 -0.271617818  0.073204129
## PC18 -0.009118173  0.262035016  0.359289870
## PC19  0.037848771  0.089071026  0.081534010
## PC20  0.090215664 -0.244510172  0.126172966
# plot correlations of PCA of expression data against SVs from SVA 
r_mat = cor(pca_expr_pfc$x, sva_res_pfc$meta_data[,sva_res_pfc$sv_names])
heatmap.2(r_mat,
          col = blueWhiteRed(50),
          Rowv = TRUE, 
          trace = "none",
          cellnote= round(as.matrix(r_mat),digits = 2), 
          notecol = "black",
          notecex=0.9,
          density.info = "none", 
          key.xlab ="Correlation (r)")

print(r_mat)
##             sv001         sv002
## PC1  -0.965997716  5.726108e-02
## PC2  -0.233003558 -5.155497e-01
## PC3  -0.076031693  7.803486e-01
## PC4  -0.037159016 -1.420795e-01
## PC5   0.049928542 -2.523472e-01
## PC6   0.010249353 -6.459147e-02
## PC7  -0.019557375  7.776587e-02
## PC8  -0.013515837 -1.358480e-01
## PC9  -0.011993882 -7.442180e-03
## PC10 -0.025400560 -6.370417e-02
## PC11  0.015998992 -2.426434e-02
## PC12 -0.027195275 -1.424549e-02
## PC13 -0.003551633  4.739788e-02
## PC14 -0.004703281 -2.630655e-06
## PC15 -0.008238460 -1.547988e-02
## PC16 -0.013855760  1.160439e-03
## PC17 -0.010070489 -4.062048e-02
## PC18  0.006465747  1.153551e-02
## PC19  0.003265470  1.642619e-02
## PC20 -0.550062806  2.239853e-01
# plot correlations of PCA of sequencing-related variabales against SVs from SVA 
r_mat = cor(pca_res_seq_pfc$x, sva_res_pfc$meta_data[,sva_res_pfc$sv_names])
heatmap.2(r_mat,
          col = blueWhiteRed(50),
          Rowv = TRUE, 
          trace = "none",
          cellnote= round(as.matrix(r_mat),digits = 2), 
          notecol = "black",
          notecex=0.9,
          density.info = "none", 
          key.xlab ="Correlation (r)")

print(r_mat)
##            sv001       sv002
## PC1  0.963259621 -0.03881844
## PC2 -0.046169244 -0.32668724
## PC3  0.009368594  0.45090031
## PC4  0.216524137  0.37554106
## PC5 -0.084828926  0.03848017
## PC6 -0.084553430  0.30079839
## PC7  0.038657053 -0.44940866
## PC8 -0.049907205  0.27799831
# HYT 
# plot correlations of PCA of expression data against PCA of sequencing-related variables 
r_mat = cor(pca_expr_hyt$x, pca_res_seq_hyt$x)
heatmap.2(r_mat,
          col = blueWhiteRed(50),
          Rowv = TRUE, 
          trace = "none",
          cellnote= round(as.matrix(r_mat),digits = 2), 
          notecol = "black",
          notecex=0.9,
          density.info = "none", 
          key.xlab ="Correlation (r)")

print(r_mat)
##               PC1          PC2          PC3          PC4         PC5
## PC1   0.784704716  0.413463608  0.223287556  0.099196015  0.29934726
## PC2   0.521416651 -0.379305442  0.006808786 -0.500834841 -0.29793439
## PC3   0.165532526 -0.626684706  0.009154236  0.575401424  0.32243093
## PC4   0.068444213 -0.234702393 -0.243232631 -0.091239041  0.39549956
## PC5   0.225067608 -0.165458095 -0.412063055  0.164686318 -0.54422429
## PC6  -0.115176406 -0.288284416  0.284418614 -0.536105506  0.27964917
## PC7   0.032131498 -0.263848227  0.143968595  0.012377105 -0.12907961
## PC8   0.002385271 -0.073576046 -0.192517518  0.011236326  0.22034722
## PC9  -0.070660139 -0.078741957  0.578039001  0.215834425 -0.20350873
## PC10  0.069050904  0.121554726 -0.128354513  0.122868979 -0.08295568
## PC11  0.020229528 -0.133775436  0.040870258 -0.001287308  0.02573071
## PC12  0.033164722 -0.087562715  0.118598102  0.052409139 -0.13849559
## PC13  0.003660998 -0.000490014 -0.116909781 -0.034211956 -0.01844659
## PC14  0.023103740 -0.064013518  0.433233300  0.111890023 -0.21751614
## PC15  0.007158223  0.025768246 -0.038898264 -0.062434930  0.04593845
## PC16 -0.057584623  0.022550220  0.107528880  0.022698637  0.03584778
## PC17  0.011574642  0.005766781 -0.039951902 -0.050697752  0.04956847
## PC18 -0.777728317 -0.501256827 -0.216249777 -0.047290663 -0.09914285
##              PC6          PC7         PC8
## PC1  -0.23493764  0.023157747 -0.02530176
## PC2   0.40268124 -0.061494252  0.18819895
## PC3   0.08414573 -0.205528609 -0.07214646
## PC4   0.24583874  0.440454521 -0.18830190
## PC5  -0.43437525  0.269980416 -0.24689257
## PC6  -0.39922334 -0.041317333 -0.27179056
## PC7  -0.42086345 -0.431752375  0.20022350
## PC8  -0.21553014  0.227359183  0.44582422
## PC9   0.07191374  0.385723230  0.16124718
## PC10  0.23975048 -0.314765384 -0.23666417
## PC11 -0.22711545  0.220207537  0.04242660
## PC12 -0.01107585  0.219718033  0.05527709
## PC13 -0.11134794 -0.124293977 -0.10536961
## PC14  0.11366350  0.058440097 -0.39925336
## PC15 -0.01081979 -0.127849959 -0.43023658
## PC16 -0.02510358  0.002605484 -0.08202267
## PC17 -0.05543935  0.258645421 -0.31685980
## PC18  0.10912374 -0.099229923 -0.02913442
# plot correlations of PCA of expression data against SVs from SVA 
r_mat = cor(pca_expr_hyt$x, sva_res_hyt$meta_data[,sva_res_hyt$sv_names])
heatmap.2(r_mat,
          col = blueWhiteRed(50),
          Rowv = TRUE, 
          trace = "none",
          cellnote= round(as.matrix(r_mat),digits = 2), 
          notecol = "black",
          notecex=0.9,
          density.info = "none", 
          key.xlab ="Correlation (r)")

print(r_mat)
##             sv001        sv002
## PC1  -0.997150824  0.030535137
## PC2   0.001812948  0.903455208
## PC3   0.056456984  0.355569358
## PC4   0.021728877  0.115039195
## PC5   0.008066996 -0.097539838
## PC6  -0.040977115 -0.170289840
## PC7  -0.003390796 -0.038900968
## PC8   0.002521880  0.029303384
## PC9   0.005768762 -0.037221874
## PC10 -0.009130765 -0.005338418
## PC11 -0.003805584 -0.009525330
## PC12 -0.007659704 -0.013844508
## PC13  0.005537246 -0.011185879
## PC14 -0.002576028 -0.012085309
## PC15 -0.004246093  0.014031111
## PC16  0.004282247 -0.011103142
## PC17  0.001374117  0.000620804
## PC18  0.945246875 -0.045107331
# plot correlations of PCA of sequencing-related variabales against SVs from SVA 
r_mat = cor(pca_res_seq_hyt$x, sva_res_hyt$meta_data[,sva_res_hyt$sv_names])
heatmap.2(r_mat,
          col = blueWhiteRed(50),
          Rowv = TRUE, 
          trace = "none",
          cellnote= round(as.matrix(r_mat),digits = 2), 
          notecol = "black",
          notecex=0.9,
          density.info = "none", 
          key.xlab ="Correlation (r)")

print(r_mat)
##           sv001       sv002
## PC1 -0.76592814  0.56028644
## PC2 -0.44249344 -0.50088918
## PC3 -0.24110988 -0.05979398
## PC4 -0.04649388 -0.19180828
## PC5 -0.28606224 -0.07014234
## PC6  0.25695695  0.53356455
## PC7 -0.01667900 -0.09189486
## PC8  0.03098848  0.19235500

PFC DE modeling

de_res_pfc = runDEmodel(expr_data = v_pfc, meta_data = meta_data_pfc, 
                        form2use = form4model_pfc,
                        gene_meta_data = gene_meta_data)

# save tables
brain_region = "pfc"
fstem = sprintf("_filt%0.2f",varianceFilterCutoff)

table_names2use = c("Fstat","interaction","group","sex")
report2screen = sprintf("%s DE genes\n",brain_region)
for (itable in 1:length(table_names2use)){
  fname2save = file.path(res_path,
                         brain_region,
                         sprintf("DE.%s_table%s.csv",
                                 table_names2use[itable],
                                 fstem))
  tab2use = de_res_pfc[[sprintf("%s_table",table_names2use[itable])]]
  tab2use = tab2use[order(tab2use$adj.P.Val),]
  tab2use$DE = "NO"
  tab2use$DE[tab2use$adj.P.Val<=fdr_qThresh] = "YES"
  write.csv(tab2use, file = fname2save)
  if (itable>1){
    report2screen = c(report2screen, 
                      sprintf("%s DE genes is %d\n",
                              table_names2use[itable],
                              sum(tab2use$adj.P.Val<=fdr_qThresh)))
  }
}
cat(report2screen)
## pfc DE genes
##  interaction DE genes is 2629
##  group DE genes is 1
##  sex DE genes is 2311
# plot percentage of DE genes on each chromosome ------------------------------
gene_meta_data_sub = subset(gene_meta_data, 
                            is.element(gene_meta_data$ensembl_gene_id, 
                                       rownames(exprs_filt_pfc)))

# DE-
tmp_df = de_res_pfc[["interaction_table"]]
tmp_df = subset(tmp_df, tmp_df$adj.P.Val<fdr_qThresh & tmp_df$logFC>0)
chr_res = chromosome_perm(gene_meta_data_sub, 
                          de_df = tmp_df, 
                          nperm=10000, 
                          fdr_qThresh=fdr_qThresh)
chr_res
##    Chromosome    percent       pval        fdr
## 1           1 0.16462585 0.00049995 0.00199980
## 10         10 0.13220339 0.22497750 0.48355164
## 11         11 0.08659794 0.99970003 1.00000000
## 12         12 0.13465784 0.20397960 0.48355164
## 13         13 0.18461538 0.00009999 0.00049995
## 14         14 0.15853659 0.01469853 0.04899510
## 15         15 0.07662083 0.99950005 1.00000000
## 16         16 0.13589744 0.19948005 0.48355164
## 17         17 0.08818342 0.99520048 1.00000000
## 18         18 0.21153846 0.00009999 0.00049995
## 19         19 0.08857809 0.98880112 1.00000000
## 2           2 0.11758794 0.66523348 1.00000000
## 3           3 0.17815126 0.00009999 0.00049995
## 4           4 0.10039630 0.97220278 1.00000000
## 5           5 0.10532838 0.93480652 1.00000000
## 6           6 0.13114754 0.24177582 0.48355164
## 7           7 0.07464325 1.00000000 1.00000000
## 8           8 0.08137715 0.99980002 1.00000000
## 9           9 0.10780142 0.88661134 1.00000000
## X           X 0.20095694 0.00009999 0.00049995
fontSize=30
p = ggplot(data = chr_res, aes(x = reorder(Chromosome,percent), y = percent, fill=-log10(pval))) + 
  geom_bar(stat="identity") + 
  scale_fill_gradientn(colors = colorRampPalette(c("grey60","#ff8d1e"))(100), 
                     limits = c(min(-log10(chr_res$pval)),max(-log10(chr_res$pval)))) +
  geom_hline(yintercept = min(chr_res[chr_res$fdr<=fdr_qThresh,"percent"])-0.01, linetype="dashed") +
  xlab("Chromosome") + ylab("Percentage DE genes") +
  coord_flip() + 
    theme(
    axis.text.x = element_text(size=fontSize+10),
    axis.text.y = element_text(size=fontSize+10),
    axis.title.x = element_text(size=fontSize+10),
    strip.text.x = element_text(size=fontSize+10),
    axis.title.y = element_text(size=fontSize+10))
ggsave(filename = file.path(plot_path, "DE-.interaction.pfc.chromosomePlot.pdf"),
       width=18, height=12)
p

# DE+
tmp_df = de_res_pfc[["interaction_table"]]
tmp_df = subset(tmp_df, tmp_df$adj.P.Val<fdr_qThresh & tmp_df$logFC<0)
chr_res = chromosome_perm(gene_meta_data_sub, 
                          de_df = tmp_df, 
                          nperm=10000, 
                          fdr_qThresh=fdr_qThresh)
chr_res
##    Chromosome    percent       pval        fdr
## 1           1 0.06394558 0.99820018 1.00000000
## 10         10 0.08474576 0.75882412 1.00000000
## 11         11 0.12371134 0.00039996 0.00399960
## 12         12 0.04415011 0.99990001 1.00000000
## 13         13 0.04175824 1.00000000 1.00000000
## 14         14 0.08048780 0.81831817 1.00000000
## 15         15 0.12573674 0.00659934 0.02199780
## 16         16 0.07435897 0.91040896 1.00000000
## 17         17 0.13051146 0.00169983 0.00849915
## 18         18 0.13461538 0.00399960 0.01599840
## 19         19 0.13752914 0.00079992 0.00533280
## 2           2 0.10351759 0.11088891 0.27722228
## 3           3 0.03529412 1.00000000 1.00000000
## 4           4 0.09247028 0.51014899 1.00000000
## 5           5 0.10904585 0.04979502 0.14227149
## 6           6 0.06721311 0.98920108 1.00000000
## 7           7 0.13721186 0.00009999 0.00199980
## 8           8 0.09546166 0.40415958 0.89813241
## 9           9 0.06950355 0.98760124 1.00000000
## X           X 0.03110048 1.00000000 1.00000000
fontSize=30
p = ggplot(data = chr_res, aes(x = reorder(Chromosome,percent), y = percent, fill=-log10(pval))) + 
  geom_bar(stat="identity") + 
  scale_fill_gradientn(colors = colorRampPalette(c("grey60","#1e90ff"))(100), 
                     limits = c(min(-log10(chr_res$pval)),max(-log10(chr_res$pval)))) +
  geom_hline(yintercept = min(chr_res[chr_res$fdr<=fdr_qThresh,"percent"])-0.01, linetype="dashed") +
  xlab("Chromosome") + ylab("Percentage DE genes") +
  coord_flip() + 
    theme(
    axis.text.x = element_text(size=fontSize+10),
    axis.text.y = element_text(size=fontSize+10),
    axis.title.x = element_text(size=fontSize+10),
    strip.text.x = element_text(size=fontSize+10),
    axis.title.y = element_text(size=fontSize+10))
ggsave(filename = file.path(plot_path, "DE+.interaction.pfc.chromosomePlot.pdf"),width=18, height=12)
p

# make a volcano plot

# interaction volcano plot
data2use = de_res_pfc[["interaction_table"]]
fname2save = file.path(plot_path, sprintf("DE.pfc.interaction.volcanoPlot%s.pdf",fstem))
colors2use = c("#ff8d1e","grey60","#1e90ff")
p = volcanoPlot(data2use = data2use, 
                fname2save = fname2save, 
                fdr_qThresh = fdr_qThresh, 
                plotTopGenes=FALSE,
                colors2use=colors2use, 
                fontSize=40, 
                dotSize=1, 
                plotWidth=10, 
                plotHeight=8)
p

# sex volcano plot
data2use = de_res_pfc[["sex_table"]]
fname2save = file.path(plot_path, sprintf("DE.pfc.sex.volcanoPlot%s.pdf",fstem))
colors2use = c("#ff8d1e","grey60","#1e90ff")
p = volcanoPlot(data2use = data2use, 
                fname2save = fname2save, 
                fdr_qThresh = fdr_qThresh, 
                plotTopGenes=TRUE,
                colors2use=colors2use, 
                fontSize=40, 
                dotSize=1, 
                plotWidth=10, 
                plotHeight=8)
p

# group volcano plot
data2use = de_res_pfc[["group_table"]]
fname2save = file.path(plot_path, sprintf("DE.pfc.group.volcanoPlot%s.pdf",fstem))
colors2use = c("grey60","#1e90ff")
p = volcanoPlot(data2use = data2use, 
                fname2save = fname2save, 
                fdr_qThresh = fdr_qThresh, 
                plotTopGenes=TRUE,
                colors2use=colors2use, 
                fontSize=40, 
                dotSize=1, 
                plotWidth=10, 
                plotHeight=8)
p

HYT DE modeling

de_res_hyt = runDEmodel(expr_data = v_hyt, meta_data = meta_data_hyt, 
                        form2use = form4model_hyt,
                        gene_meta_data = gene_meta_data)

# save tables
brain_region = "hyt"
fstem = sprintf("_filt%0.2f",varianceFilterCutoff)

table_names2use = c("Fstat","interaction","group","sex")
report2screen = sprintf("%s DE genes\n",brain_region)
for (itable in 1:length(table_names2use)){
  fname2save = file.path(res_path,
                         brain_region,
                         sprintf("DE.%s_table%s.csv",
                                 table_names2use[itable],
                                 fstem))
  tab2use = de_res_hyt[[sprintf("%s_table",table_names2use[itable])]]
  tab2use = tab2use[order(tab2use$adj.P.Val),]
  tab2use$DE = "NO"
  tab2use$DE[tab2use$adj.P.Val<=fdr_qThresh] = "YES"
  write.csv(tab2use, file = fname2save)
  if (itable>1){
    report2screen = c(report2screen, 
                      sprintf("%s DE genes is %d\n",table_names2use[itable],
                              sum(tab2use$adj.P.Val<=fdr_qThresh)))
  }
}
cat(report2screen)
## hyt DE genes
##  interaction DE genes is 0
##  group DE genes is 1
##  sex DE genes is 723
# make a volcano plot 

# interaction volcano plot
data2use = de_res_hyt[["interaction_table"]]
fname2save = file.path(plot_path, sprintf("DE.hyt.interaction.volcanoPlot%s.pdf",fstem))
colors2use = "grey60"
p = volcanoPlot(data2use = data2use, 
                fname2save = fname2save, 
                fdr_qThresh = fdr_qThresh, 
                plotTopGenes=FALSE,
                colors2use=colors2use, 
                fontSize=40, 
                dotSize=1, 
                plotWidth=10, 
                plotHeight=8)
p

# sex volcano plot
data2use = de_res_hyt[["sex_table"]]
fname2save = file.path(plot_path, sprintf("DE.hyt.sex.volcanoPlot%s.pdf",fstem))
colors2use = c("#ff8d1e","grey60","#1e90ff")
p = volcanoPlot(data2use = data2use, 
                fname2save = fname2save, 
                fdr_qThresh = fdr_qThresh, 
                plotTopGenes=TRUE,
                colors2use=colors2use, 
                fontSize=40, 
                dotSize=1, 
                plotWidth=10, 
                plotHeight=8)
p

# group volcano plot
data2use = de_res_hyt[["group_table"]]
fname2save = file.path(plot_path, sprintf("DE.hyt.group.volcanoPlot%s.pdf",fstem))
colors2use = c("grey60","#1e90ff")
p = volcanoPlot(data2use = data2use, 
                fname2save = fname2save, 
                fdr_qThresh = fdr_qThresh, 
                plotTopGenes=TRUE,
                colors2use=colors2use, 
                fontSize=40, 
                dotSize=1, 
                plotWidth=10, 
                plotHeight=8)
p

Plot enrichments as heatmap

poslogfc_res$type = "DE-"
neglogfc_res$type = "DE+"
poslogfc_sexhormone_res$type = "DE-"
neglogfc_sexhormone_res$type = "DE+"

df_gsea = rbind(poslogfc_res, neglogfc_res, poslogfc_sexhormone_res, neglogfc_sexhormone_res)
df_gsea$genelist = factor(df_gsea$genelist, 
                          levels = rev(c("dup15q DE+","dup15q DE-", 
                                         "SFARI", "ptLGD",
                                         "ASD DE+","ASD DE-",
                                         "SCZ DE","BD DE",
                                         "DHT DE+","DHT DE-",
                                         "EST DE+","EST DE-",
                                         "AR Targets","ER Targets",
                                         "Male DT","Female DT","SexDiv DT")))
df_gsea
##               genelist        OR         pval          fdr type
## SFARI            SFARI 1.4438127 1.769705e-02 4.719213e-02  DE-
## ptLGD            ptLGD 1.4340875 1.918718e-01 3.837435e-01  DE-
## dup15q DE+  dup15q DE+ 0.4652926 9.952573e-01 9.999477e-01  DE-
## dup15q DE-  dup15q DE- 3.7788889 4.149096e-20 3.319276e-19  DE-
## ASD DE+        ASD DE+ 0.6071054 9.999477e-01 9.999477e-01  DE-
## ASD DE-        ASD DE- 2.1676101 4.752783e-09 1.901113e-08  DE-
## SCZ DE          SCZ DE 0.9348247 9.997996e-01 9.999477e-01  DE-
## BD DE            BD DE 0.8422789 9.903868e-01 9.999477e-01  DE-
## SFARI1           SFARI 1.2148546 2.486948e-01 3.315931e-01  DE+
## ptLGD1           ptLGD 2.4260617 4.280837e-04 1.141557e-03  DE+
## dup15q DE+1 dup15q DE+ 1.5718039 1.447686e-01 2.895371e-01  DE+
## dup15q DE-1 dup15q DE- 0.3087027 9.999997e-01 9.999997e-01  DE+
## ASD DE+1       ASD DE+ 1.8783869 1.400588e-04 1.120470e-03  DE+
## ASD DE-1       ASD DE- 0.5851405 9.999574e-01 9.999997e-01  DE+
## SCZ DE1         SCZ DE 1.4541750 2.836024e-04 1.134410e-03  DE+
## BD DE1           BD DE 1.2444558 2.228412e-01 3.315931e-01  DE+
## DHT DE+        DHT DE+ 1.3394438 1.825064e-01 5.475191e-01  DE-
## DHT DE-        DHT DE- 0.1891144 9.999942e-01 1.000000e+00  DE-
## EST DE+        EST DE+ 1.9920506 1.050971e-01 4.729370e-01  DE-
## EST DE-        EST DE- 1.3250323 4.762096e-01 8.571773e-01  DE-
## AR Targets  AR Targets 0.8140267 1.000000e+00 1.000000e+00  DE-
## ER Targets  ER Targets 1.0918760 6.684622e-01 1.000000e+00  DE-
## Male DT        Male DT 0.4566315 1.000000e+00 1.000000e+00  DE-
## Female DT    Female DT 1.9977308 2.596165e-07 2.336549e-06  DE-
## SexDiv DT    SexDiv DT 1.2341137 3.067365e-01 6.901571e-01  DE-
## DHT DE+1       DHT DE+ 1.0106258 7.099297e-01 9.997345e-01  DE+
## DHT DE-1       DHT DE- 2.2034290 4.484512e-03 1.345354e-02  DE+
## EST DE+1       EST DE+ 0.4878553 9.366615e-01 9.997345e-01  DE+
## EST DE-1       EST DE- 0.3886517 9.393973e-01 9.997345e-01  DE+
## AR Targets1 AR Targets 1.7105866 7.282498e-10 6.554248e-09  DE+
## ER Targets1 ER Targets 0.6071263 9.997345e-01 9.997345e-01  DE+
## Male DT1       Male DT 2.0990700 4.238625e-08 1.907381e-07  DE+
## Female DT1   Female DT 0.6863210 9.991975e-01 9.997345e-01  DE+
## SexDiv DT1   SexDiv DT 1.4805624 1.855500e-02 4.174876e-02  DE+
fontSize = 5
p = ggplot(data = df_gsea) + 
  geom_tile(aes(y = genelist, x = type, fill= -log10(pval))) + 
  geom_text(aes(y=genelist, x=type,label = round(OR,2)), size = fontSize) + 
  scale_fill_gradient(low = "white", high="red") + ylab("")+xlab("") +
  theme(
    # Remove panel border
    panel.border = element_blank(),  
    # Remove panel grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Remove panel background
    panel.background = element_blank(),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize+10),
    axis.title.x = element_text(size=fontSize),
    strip.text.x = element_text(size=fontSize),
    axis.title.y = element_text(size=fontSize))
  
ggsave(filename = file.path(plot_path, sprintf("DE.interaction.pfc.EnrichmentHeatmap%s.pdf",fstem)),width = 5, height=5)
p

Plot specific genes of interest

PLOT = FALSE
if (PLOT){
  dotSize = 10
  fontSize = 40
  # get DE genes
  mask = de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh
  genes2plot = de_res_pfc[["interaction_table"]]$ensembl_id[mask]
  gene_names = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
  
  mask = is.element(hsap_homolog_gene_meta_data$hsapiens_homolog_ensembl_gene, sfari_genes_de_overlap$ensembl.id)
  gene_names2print = hsap_homolog_gene_meta_data$external_gene_name[mask]
  
  for (igene in 1:length(genes2plot)){
    gene2plot = genes2plot[igene]
    gene_name = gene_names[igene]
    
    exprdata2plot = subset(v_pfc$E,is.element(rownames(v_pfc$E),gene2plot))
    data2plot = data.frame(meta_data_pfc, expr_data = t(exprdata2plot))
    colnames(data2plot)[dim(data2plot)[2]] = "expr_data"
    
    p = ggplot(data = data2plot, aes(x = group, y = expr_data, colour = sex)) + facet_grid(. ~ sex)
    p = p + geom_jitter(size = dotSize) + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
    p = p + ylab("log(CPM)") + xlab("") + ggtitle(gene_name) +  
        theme(axis.text.x = element_text(size=fontSize),
        axis.text.y = element_text(size=fontSize),
        axis.title.x = element_text(size=fontSize),
        strip.text.x = element_text(size=fontSize),
        axis.title.y = element_text(size=fontSize),
        plot.title = element_text(hjust = 0.5, size=fontSize))

    ggsave(filename = file.path(plot_path, sprintf("%s.pdf",gene_name)), width=10,height=8)
    
  }
}

Enrichment analysis with Allen cell types

plotWidth = 10
plotHeight = 7

# specify which databases to query
dbs = c("Allen_Brain_Atlas_10x_scRNA_2021")
dbs_names = c("Allen10x_scRNA_2021")

topNterms = 25
brain_region = "pfc"
de_genes2use = de_res_pfc[["interaction_table"]]$external_gene_name[de_res_pfc[["interaction_table"]]$adj.P.Val<=fdr_qThresh]
mask = de_res_pfc[["interaction_table"]]$adj.P.Val<=fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC>0
de_genes2use1 = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = de_res_pfc[["interaction_table"]]$adj.P.Val<=fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC<0
de_genes2use2 = de_res_pfc[["interaction_table"]]$external_gene_name[mask]

if (!is_empty(de_genes2use)){
  enrichment_res = enrichr(de_genes2use, dbs)
  enrichment_res1 = enrichr(de_genes2use1, dbs)
  enrichment_res2 = enrichr(de_genes2use2, dbs)
  
  for (idb in 1:length(dbs_names)){
    
    # all DE genes
    go_res = enrichment_res[[dbs[idb]]]
    write.csv(go_res, file = file.path(res_path,
                                       brain_region,
                                       sprintf("DE.interaction.%s%s.csv",
                                               dbs_names[idb],
                                               fstem)))
    data4plot = go_res[go_res$Adjusted.P.value<=fdr_qThresh,1:(ncol(go_res)-1)]
    data4plot = data4plot[1:topNterms,]
    p = ggplot(data = data4plot, 
               aes(x = reorder(Term, -log10(P.value)), 
                   y = -log10(P.value), 
                   fill = -log10(P.value))) +
      geom_bar(stat = "identity") + 
      ylab("-log10(p)") + 
      xlab(" ") + 
      geom_hline(yintercept = -log10(0.05), linetype="dashed") +
      scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
                           limits = c(-log10(0.05),max(-log10(data4plot$P.value)))) +
      coord_flip()
    ggsave(filename = file.path(plot_path,
                                sprintf("DE.interaction.%s.%s%s.pdf",
                                        brain_region,
                                        dbs_names[idb],
                                        fstem)),
           width = plotWidth, height = plotHeight)

    # DE-
    go_res = enrichment_res1[[dbs[idb]]]
    write.csv(go_res, file = file.path(res_path,
                                       brain_region,
                                       sprintf("DE.interaction.%s.PosLogFC%s.csv",
                                               dbs_names[idb],
                                               fstem)))
    data4plot = go_res[go_res$Adjusted.P.value<=fdr_qThresh,1:(ncol(go_res)-1)]
    data4plot = data4plot[1:topNterms,]
    p = ggplot(data = data4plot, aes(x = reorder(Term, -log10(P.value)), y = -log10(P.value), fill = -log10(P.value))) + 
      geom_bar(stat = "identity") + 
      ylab("-log10(p)") + 
      xlab(" ") + 
      geom_hline(yintercept = -log10(0.05), linetype="dashed") +
      scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
                           limits = c(-log10(0.05),max(-log10(data4plot$P.value)))) +
      coord_flip()
    ggsave(filename = file.path(plot_path,
                                sprintf("DE.interaction.%s.%s.PosLogFC%s.pdf",
                                        brain_region,
                                        dbs_names[idb],
                                        fstem)),
           width = plotWidth, height = plotHeight)

    # DE+
    go_res = enrichment_res2[[dbs[idb]]]
    write.csv(go_res, file = file.path(res_path,
                                       brain_region,
                                       sprintf("DE.interaction.%s.NegLogFC%s.csv",
                                               dbs_names[idb],
                                               fstem)))
    data4plot = go_res[go_res$Adjusted.P.value<=fdr_qThresh,1:(ncol(go_res)-1)]
    data4plot = data4plot[1:topNterms,]
    p = ggplot(data = data4plot, aes(x = reorder(Term, -log10(P.value)), 
                                     y = -log10(P.value), 
                                     fill = -log10(P.value))) +
      geom_bar(stat = "identity") + 
      ylab("-log10(p)") + 
      xlab(" ") + 
      geom_hline(yintercept = -log10(0.05), linetype="dashed") +
      scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
                           limits = c(-log10(0.05),max(-log10(data4plot$P.value)))) +
      coord_flip()
    ggsave(filename = file.path(plot_path,
                                sprintf("DE.interaction.%s.%s.NegLogFC%s.pdf",
                                        brain_region,
                                        dbs_names[idb],
                                        fstem)),
           width = plotWidth, height = plotHeight)
    
  } # for (idb in 1:length(dbs_names)){

} # if (!is_empty(de_genes2use)){
## Uploading data to Enrichr... Done.
##   Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
## Parsing results... Done.
## Uploading data to Enrichr... Done.
##   Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
## Parsing results... Done.
## Uploading data to Enrichr... Done.
##   Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
## Parsing results... Done.

Annotate Allen cell type enrichments

brain_region = "pfc"
fnames2loop  = c("DE.interaction.Allen10x_scRNA_2021","DE.interaction.Allen10x_scRNA_2021.NegLogFC",
                 "DE.interaction.Allen10x_scRNA_2021.PosLogFC")

for (i in 1:length(fnames2loop)){
  celltype_data = read.csv(file.path(res_path, brain_region,sprintf("%s_filt0.15.csv",fnames2loop[i])))
  celltype_data = annotate_celltypes(celltype_data)
  celltype_data = subset(celltype_data, celltype_data$Adjusted.P.value<=fdr_qThresh)
  celltype_data = celltype_data[order(celltype_data$class, decreasing = FALSE),]

  write.csv(celltype_data, file = file.path(res_path, brain_region,sprintf("%s_FINAL_filt0.15.csv",fnames2loop[i])))
}# for (i in 1:length(fnames2loop)){

# -----------------------------------------------------------------------------
# plot DE neg and pos log FC direction in one plot
i=2
celltype_data = read.csv(file.path(res_path, brain_region,sprintf("%s_filt0.15.csv",fnames2loop[i])))
celltype_data = annotate_celltypes(celltype_data)
de_neg_celltypes = subset(celltype_data, celltype_data$Adjusted.P.value<=fdr_qThresh)
de_neg_celltypes$direction = "DE+" #"Neg log2(FC)"

i=3
celltype_data = read.csv(file.path(res_path, brain_region,sprintf("%s_filt0.15.csv",fnames2loop[i])))
celltype_data = annotate_celltypes(celltype_data)
de_pos_celltypes = subset(celltype_data, celltype_data$Adjusted.P.value<=fdr_qThresh)
de_pos_celltypes$direction = "DE-" # "Pos log2(FC)"

de_celltypes = rbind(de_pos_celltypes, de_neg_celltypes)
de_celltypes$direction = factor(de_celltypes$direction, levels = c("DE+","DE-"))

# get non-duplicated cluster IDs and final_label2
tmp = de_celltypes[,c("cluster_id","final_label2","class_label","plotcolor2use")]
tmp = tmp[!duplicated(tmp),]
tmp = tmp[order(tmp$cluster_id),]
tmp$cluster_id = factor(tmp$cluster_id)
tmp$final_label2 = factor(tmp$final_label2)
tmp$plotcolor2use = factor(tmp$plotcolor2use)
tmp$class_label = factor(tmp$class_label)
tmp = tmp %>% mutate(
  name = glue("<b style='color:{plotcolor2use}'>{final_label2}</b>"),
  name = fct_reorder(name, -as.numeric(cluster_id))
)

# turn cluster id into a factor and then reverse the ordering of the levels
de_celltypes$cluster_id = factor(de_celltypes$cluster_id)
de_celltypes$cluster_id = fct_rev(de_celltypes$cluster_id)

df2plot = merge(de_celltypes, tmp[,c("final_label2","name")], by = "final_label2")

p = ggplot(data = df2plot, aes(x = name, y = -log10(P.value), fill=class_label)) +
  facet_grid(. ~ direction) +
  geom_bar(stat = "identity") +
  # scale_x_discrete(limits = rev(levels(tmp$cluster_id)), labels = rev(tmp$final_label2)) +
  xlab("Cell Types") + ylab("-log10(p)") + coord_flip() +  ggtitle(" ") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.y = element_markdown())
ggsave(filename = file.path(plot_path, "DE.celltypes.pdf"), height=12, width = 12)
p

p = ggplot(data = df2plot, aes(x = direction, y = name), fill = class_label) +
  geom_tile(aes(x = direction, y = name, fill= -log10(P.value))) +
  geom_text(aes(x = direction, y = name, label = round(Odds.Ratio,2))) +
  scale_fill_gradient(low = "white", high="red") + ylab("")+xlab("") +
  theme(
    # Remove panel border
    panel.border = element_blank(),
    # Remove panel grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Remove panel background
    panel.background = element_blank(),
    axis.text.y = element_markdown())
ggsave(filename = file.path(plot_path, "DE.celltypes.Heatmap.pdf"), width=6, height=12)
p